R Markdown

#packages i will need for project
library(readr)
library(tidyverse)
library(dbplyr)
library(viridis)
library(usmap)
library(modelr)
library(plotly)
library(hrbrthemes)
library(ggridges)
library(randomForest)
library(vip)
#import dataset
dir01 <- "/Users/Kalide/Downloads/countries of the world.csv"
path01 <- file.path(dir01)
df01 <- read_csv(path01)
#rename variables
df01 <- df01 %>%
  rename(Area_sq.mi=`Area (sq. mi.)`,
         pop.density_sq.mi = `Pop. Density (per sq. mi.)`,
         Coastline = `Coastline (coast/area ratio)`,
         inf.mortality_per1000births = `Infant mortality (per 1000 births)`,
         GDP_percapita = `GDP ($ per capita)`, 
         Literacy.pct = `Literacy (%)`, 
         Phones_per1000 = `Phones (per 1000)`, 
         Arable_pct = `Arable (%)`, 
         Crops_pct = `Crops (%)`, 
         Other_pct = `Other (%)`)
#Exploratory data analysis
tibble(df01)
## # A tibble: 227 × 20
##    Country           Region     Population Area_sq.mi pop.density_sq.… Coastline
##    <chr>             <chr>           <dbl>      <dbl>            <dbl>     <dbl>
##  1 Afghanistan       ASIA (EX.…   31056997     647500             48        0   
##  2 Albania           EASTERN E…    3581655      28748            125.       1.26
##  3 Algeria           NORTHERN …   32930091    2381740             13.8      0.04
##  4 American Samoa    OCEANIA         57794        199            290.      58.3 
##  5 Andorra           WESTERN E…      71201        468            152.       0   
##  6 Angola            SUB-SAHAR…   12127071    1246700              9.7      0.13
##  7 Anguilla          LATIN AME…      13477        102            132.      59.8 
##  8 Antigua & Barbuda LATIN AME…      69108        443            156       34.5 
##  9 Argentina         LATIN AME…   39921833    2766890             14.4      0.18
## 10 Armenia           C.W. OF I…    2976372      29800             99.9      0   
## # … with 217 more rows, and 14 more variables: Net migration <dbl>,
## #   inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## #   Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## #   Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## #   Industry <dbl>, Service <dbl>
summary(df01)
##    Country             Region            Population          Area_sq.mi      
##  Length:227         Length:227         Min.   :7.026e+03   Min.   :       2  
##  Class :character   Class :character   1st Qu.:4.376e+05   1st Qu.:    4648  
##  Mode  :character   Mode  :character   Median :4.787e+06   Median :   86600  
##                                        Mean   :2.874e+07   Mean   :  598227  
##                                        3rd Qu.:1.750e+07   3rd Qu.:  441811  
##                                        Max.   :1.314e+09   Max.   :17075200  
##                                                                              
##  pop.density_sq.mi    Coastline      Net migration      
##  Min.   :    0.00   Min.   :  0.00   Min.   :-20.99000  
##  1st Qu.:   29.15   1st Qu.:  0.10   1st Qu.: -0.92750  
##  Median :   78.80   Median :  0.73   Median :  0.00000  
##  Mean   :  379.05   Mean   : 21.17   Mean   :  0.03812  
##  3rd Qu.:  190.15   3rd Qu.: 10.35   3rd Qu.:  0.99750  
##  Max.   :16271.50   Max.   :870.66   Max.   : 23.06000  
##                                      NA's   :3          
##  inf.mortality_per1000births GDP_percapita    Literacy.pct    Phones_per1000  
##  Min.   :  2.29              Min.   :  500   Min.   : 17.60   Min.   :   0.2  
##  1st Qu.:  8.15              1st Qu.: 1900   1st Qu.: 70.60   1st Qu.:  37.8  
##  Median : 21.00              Median : 5550   Median : 92.50   Median : 176.2  
##  Mean   : 35.51              Mean   : 9690   Mean   : 82.84   Mean   : 236.1  
##  3rd Qu.: 55.70              3rd Qu.:15700   3rd Qu.: 98.00   3rd Qu.: 389.6  
##  Max.   :191.19              Max.   :55100   Max.   :100.00   Max.   :1035.6  
##  NA's   :3                   NA's   :1       NA's   :18       NA's   :4       
##    Arable_pct      Crops_pct        Other_pct         Climate     
##  Min.   : 0.00   Min.   : 0.000   Min.   : 33.33   Min.   :1.000  
##  1st Qu.: 3.22   1st Qu.: 0.190   1st Qu.: 71.65   1st Qu.:2.000  
##  Median :10.42   Median : 1.030   Median : 85.70   Median :2.000  
##  Mean   :13.80   Mean   : 4.564   Mean   : 81.64   Mean   :2.139  
##  3rd Qu.:20.00   3rd Qu.: 4.440   3rd Qu.: 95.44   3rd Qu.:3.000  
##  Max.   :62.11   Max.   :50.680   Max.   :100.00   Max.   :4.000  
##  NA's   :2       NA's   :2        NA's   :2        NA's   :22     
##    Birthrate       Deathrate       Agriculture         Industry     
##  Min.   : 7.29   Min.   : 2.290   Min.   :0.00000   Min.   :0.0200  
##  1st Qu.:12.67   1st Qu.: 5.910   1st Qu.:0.03775   1st Qu.:0.1930  
##  Median :18.79   Median : 7.840   Median :0.09900   Median :0.2720  
##  Mean   :22.11   Mean   : 9.241   Mean   :0.15084   Mean   :0.2827  
##  3rd Qu.:29.82   3rd Qu.:10.605   3rd Qu.:0.22100   3rd Qu.:0.3410  
##  Max.   :50.73   Max.   :29.740   Max.   :0.76900   Max.   :0.9060  
##  NA's   :3       NA's   :4        NA's   :15        NA's   :16      
##     Service      
##  Min.   :0.0620  
##  1st Qu.:0.4293  
##  Median :0.5710  
##  Mean   :0.5653  
##  3rd Qu.:0.6785  
##  Max.   :0.9540  
##  NA's   :15

looks like there is a decent amount of climate, literacy, industry, service, and agriculture data missing.

#count number of NA's in file and visualize NAs
missing.values <- df01 %>%
  gather(key = "key", value = "val") %>%
  mutate(is.missing = is.na(val)) %>%
  group_by(key, is.missing) %>%
  dplyr::summarise(num.missing = n()) %>%
  filter(is.missing == T) %>%
  select(-is.missing) %>%
  arrange(desc(num.missing))


#plot the chart of NAs
missing.values %>%
  ggplot() +
  geom_bar(aes(x=reorder(key, +num.missing), y=num.missing, fill = key), 
           stat = 'identity') + 
  coord_flip() +
  labs(x='variable', 
       y="number of missing values", 
       title='Visual of Missing Values') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme_minimal()

omit na’s from df

df02 <- na.omit(df01)

#revised dataframe
df02
## # A tibble: 179 × 20
##    Country           Region     Population Area_sq.mi pop.density_sq.… Coastline
##    <chr>             <chr>           <dbl>      <dbl>            <dbl>     <dbl>
##  1 Afghanistan       ASIA (EX.…   31056997     647500             48        0   
##  2 Albania           EASTERN E…    3581655      28748            125.       1.26
##  3 Algeria           NORTHERN …   32930091    2381740             13.8      0.04
##  4 Anguilla          LATIN AME…      13477        102            132.      59.8 
##  5 Antigua & Barbuda LATIN AME…      69108        443            156       34.5 
##  6 Argentina         LATIN AME…   39921833    2766890             14.4      0.18
##  7 Armenia           C.W. OF I…    2976372      29800             99.9      0   
##  8 Aruba             LATIN AME…      71891        193            372.      35.5 
##  9 Australia         OCEANIA      20264082    7686850              2.6      0.34
## 10 Austria           WESTERN E…    8192880      83870             97.7      0   
## # … with 169 more rows, and 14 more variables: Net migration <dbl>,
## #   inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## #   Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## #   Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## #   Industry <dbl>, Service <dbl>

Correlation of variables

The factors that correlate with GDP per capita the most are net migration, infant mortality, literacy (%), phones (per 1000), climate, Birthrate, agriculture, service

df02.cor <- subset(df02,
                   select = -c(Country, Region)) #remove non-numeric columns

ggcorrplot::ggcorrplot(cor(df02.cor), tl.cex = 12) 

Creating correlation dataframe

corr <- cor(df02.cor$GDP_percapita,df02.cor[, unlist(lapply(df02.cor,is.numeric))])
corr <- as.tibble(t(corr))

correlation <- data.frame(colnames(df02.cor[, unlist(lapply(df02.cor,is.numeric))]), corr)

correlation <- correlation %>%
  rename(Features = colnames.df02.cor...unlist.lapply.df02.cor..is.numeric...., Correlation = V1) %>%
  mutate(abs.correlation = abs(Correlation)) %>%
  arrange(desc(abs.correlation))

correlation
##                       Features Correlation abs.correlation
## 1                GDP_percapita  1.00000000      1.00000000
## 2               Phones_per1000  0.88352011      0.88352011
## 3                    Birthrate -0.65879545      0.65879545
## 4  inf.mortality_per1000births -0.63908984      0.63908984
## 5                  Agriculture -0.61691880      0.61691880
## 6                      Service  0.53655075      0.53655075
## 7                 Literacy.pct  0.52288013      0.52288013
## 8                Net migration  0.37879042      0.37879042
## 9                      Climate  0.36056651      0.36056651
## 10                   Deathrate -0.24756242      0.24756242
## 11                   Crops_pct -0.20784374      0.20784374
## 12           pop.density_sq.mi  0.19012217      0.19012217
## 13                  Area_sq.mi  0.06835595      0.06835595
## 14                   Other_pct  0.06644526      0.06644526
## 15                  Arable_pct  0.04646475      0.04646475
## 16                   Coastline  0.03581518      0.03581518
## 17                  Population -0.03361824      0.03361824
## 18                    Industry  0.03285465      0.03285465

log phones? – will be logged log arable –weak correlation. not necessary. will be dropped log agriculture – will be logged log deathrate – will be logged - not a strong predictor

#gdp looks right skewed, transform by using the logged values.

library(ggplot2)
df02 %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()  +                      # as density
  theme_minimal()

ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = `Net migration`)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Net migration", 
       title = "Plot of gdp per capita and net migration")

#-----will be logged, relation will improve since inf. mortality is right skewed.
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = inf.mortality_per1000births)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Infant mortality (per 1000 births)", 
       title = "Plot of gdp per capita and Infant mortality (per 1000 births)")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Literacy.pct)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Literacy (%)", 
       title = "Plot of gdp per capita and Literacy (%)")

#-----Will be logged to improve
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Phones_per1000)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Phones (per 1000)", 
       title = "Plot of gdp per capita and Phones (per 1000)")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Climate, group = Climate)) +
  geom_boxplot() +
  labs(x = "log10 gdp per capita",
       y = "Climate", 
       title = "Plot of gdp per capita and Climate")

#-----Will be logged to improve
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Birthrate)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Birthrate", 
       title = "Plot of gdp per capita and Birthrate")

#----- Will be logged to improve
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Agriculture)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Agriculture", 
       title = "Plot of gdp per capita and Agriculture")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = Service)) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Service", 
       title = "Plot of gdp per capita and Service")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(inf.mortality_per1000births))) +
  geom_point() + 
  geom_smooth() + 
  geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Infant mortality (per 1000 births)", 
       title = "Plot of gdp percapita and log10 infant mortality(per1000births)")

#----- 
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(Phones_per1000))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Phones (per 1000)", 
       title = "Plot of gdp per capita and log10 Phones (per 1000)")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(Birthrate))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Birthrate", 
       title = "Plot of gdp per capita and log10 Birthrate")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(Agriculture))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Agriculture", 
       title = "Plot of gdp per capita and log10 Agriculture")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(Population))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Population", 
       title = "Plot of gdp per capita and log10 Population")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(Area_sq.mi))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "Area_sq.mi", 
       title = "Plot of gdp per capita and log10 Area_sq.mi")

#-----
ggplot(data = df02.cor, 
       aes(x = log10(GDP_percapita),
           y = log10(pop.density_sq.mi))) +
  geom_point() + 
  geom_smooth() + geom_smooth(method = "lm", color = "red") +
  labs(x = "log10 gdp per capita",
       y = "pop.density_sq.mi", 
       title = "Plot of gdp per capita and log10 pop.density_sq.mi")

#add logged data columns into dataframe
df03 <- df02.cor
df03 <- df03 %>%
  add_column(loggdp_percapita =  log10(df03$GDP_percapita), 
             loginf.mortality_per1000births = log10(df03$inf.mortality_per1000births),
             logBirthrate = log10(df03$Birthrate),
             logAgriculture = log10(df03$Agriculture),
             logArea_sq.mi = log10(df03$Area_sq.mi),
             logpop.density_sq.mi = log10(df03$pop.density_sq.mi))

df03
## # A tibble: 179 × 24
##    Population Area_sq.mi pop.density_sq.mi Coastline `Net migration`
##         <dbl>      <dbl>             <dbl>     <dbl>           <dbl>
##  1   31056997     647500              48        0              23.1 
##  2    3581655      28748             125.       1.26           -4.93
##  3   32930091    2381740              13.8      0.04           -0.39
##  4      13477        102             132.      59.8            10.8 
##  5      69108        443             156       34.5            -6.15
##  6   39921833    2766890              14.4      0.18            0.61
##  7    2976372      29800              99.9      0              -6.47
##  8      71891        193             372.      35.5             0   
##  9   20264082    7686850               2.6      0.34            3.98
## 10    8192880      83870              97.7      0               2   
## # … with 169 more rows, and 19 more variables:
## #   inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## #   Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## #   Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## #   Industry <dbl>, Service <dbl>, loggdp_percapita <dbl>,
## #   loginf.mortality_per1000births <dbl>, logBirthrate <dbl>,
## #   logAgriculture <dbl>, logArea_sq.mi <dbl>, logpop.density_sq.mi <dbl>

re-run correlation: high correlation – logbirthrate, loginf.mortality_per1000births, agricultre,service

df03.cor <- df03 %>%
  filter(logAgriculture != "-Inf")
df03.cor <- subset(df03.cor)
ggcorrplot::ggcorrplot(cor(df03.cor), tl.cex = 12)

visualization

df02 %>%
  ggplot(aes(x= Region, y=Area_sq.mi, group = Region)) + 
  geom_violin() +
  theme(axis.text.x = element_text(color = "black", 
                                   size = 7, angle=30,
                                   vjust=.8, hjust=0.8))

df02 %>%
  ggplot(aes(x=Region, y=Birthrate, group = Region, fill = Region)) + 
  geom_violin() +
  theme(axis.text.x = element_text(color = "black", 
                                   size = 7, angle=30,
                                   vjust=.8, hjust=0.8), 
        legend.position = "none")

df02 %>%
  ggplot(aes(x=Region, y=GDP_percapita, group = Region, fill = Region)) + 
  geom_violin() +
  theme(axis.text.x = element_text(color = "black", 
                                   size = 7, angle=30,
                                   vjust=.8, hjust=0.8), 
        legend.position = "none")

df03.cor %>%
  ggplot(aes(x=Climate, y=loginf.mortality_per1000births, 
             group = Climate, fill = Climate)) + 
  geom_violin() +
  theme(axis.text.x = element_text(color = "black", 
                                   size = 7, angle=30,
                                   vjust=.8, hjust=0.8), 
        legend.position = "none") + 
  scale_fill_viridis(discrete = FALSE, option = "H") + theme_minimal()

df03$Country <- df02$Country
df03$Region <- df02$Region


df03 %>%
  ggplot(aes(x=loggdp_percapita, y=Region, fill = Region)) + 
  geom_density_ridges() + theme_ridges() + theme(legend.position = "none")

df03 %>%
  ggplot(aes(x=loggdp_percapita, y=logAgriculture, 
             size = loggdp_percapita, color = Region)) + 
  geom_point() + theme_ipsum()

df03 %>%
  ggplot(aes(x=loggdp_percapita, y=Service, 
             size = loggdp_percapita, color = Region)) + 
  geom_point() + theme_ipsum()

#pop
df03 %>%
  ggplot(aes(x = Region, fill = Region)) +
  geom_bar(aes(group = mean(Population))) + labs(x = "Regions") + 
  theme(legend.position = "none",
        axis.text.x = element_text(color = "black",
                                   size = 6, angle=30,
                                   vjust=.8, hjust=0.8))

df03 %>%
  ggplot() + 
  geom_bar(aes(reorder(x=Region, Region, function(x) - length(x)), 
           fill = Region)) +
  labs(x = "Regions") + theme(legend.position = "none", 
                              axis.text.x = element_text(color = "black",
                                                         size = 6, angle=30, 
                                                         vjust=.8, hjust=0.8))

df03 %>%
  ggplot() +
  geom_point(aes(x=GDP_percapita, y= Literacy.pct, 
                 color = as.factor(Region))) + 
  facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")

df03 %>%
  ggplot() +
  geom_point(aes(x=GDP_percapita, y= Industry, 
                 color = as.factor(Region))) + 
  facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")

df03 %>%
  ggplot() +
  geom_point(aes(x=GDP_percapita, y= Service, 
                 color = as.factor(Region))) + 
  facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")

df03 %>%
  ggplot() +
  geom_point(aes(x=GDP_percapita, y= Agriculture, 
                 color = as.factor(Region))) + 
  facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")

so, the strongest correlator to log10gdp_percapita are loginf.mortality_per1000births, phones (per 1000), birth rate, agriculture, then Literacy.pct , service, Deathrate, Climate, logArea_sq.mi, Net migration,logpop.density_sq.mi

Create absolute value correlation dataframe

fcorr <- cor(df03.cor$loggdp_percapita,df03.cor[, unlist(lapply(df03.cor,is.numeric))])
fcorr <- as.tibble(t(fcorr))


finalcorrelation <- data.frame(colnames(df03.cor[, unlist(lapply(df03.cor,is.numeric))]), fcorr)

finalcorrelation <- finalcorrelation %>%
  rename(Features = colnames.df03.cor...unlist.lapply.df03.cor..is.numeric...., Correlation = V1) %>%
  mutate(abs.correlation = abs(Correlation)) %>%
  arrange(desc(abs.correlation))

finalcorrelation
##                          Features Correlation abs.correlation
## 1                loggdp_percapita  1.00000000      1.00000000
## 2                   GDP_percapita  0.88807428      0.88807428
## 3  loginf.mortality_per1000births -0.88379315      0.88379315
## 4                  Phones_per1000  0.84648739      0.84648739
## 5                       Birthrate -0.83259356      0.83259356
## 6                    logBirthrate -0.83057113      0.83057113
## 7     inf.mortality_per1000births -0.82806953      0.82806953
## 8                  logAgriculture -0.81456340      0.81456340
## 9                     Agriculture -0.78374530      0.78374530
## 10                   Literacy.pct  0.68479871      0.68479871
## 11                        Service  0.59007464      0.59007464
## 12                      Deathrate -0.39242424      0.39242424
## 13                        Climate  0.34482417      0.34482417
## 14                  logArea_sq.mi -0.24482617      0.24482617
## 15                  Net migration  0.22612005      0.22612005
## 16           logpop.density_sq.mi  0.19993760      0.19993760
## 17                      Crops_pct -0.15236322      0.15236322
## 18              pop.density_sq.mi  0.15184349      0.15184349
## 19                       Industry  0.14573437      0.14573437
## 20                     Arable_pct  0.06439032      0.06439032
## 21                     Area_sq.mi  0.04974957      0.04974957
## 22                      Coastline  0.04571155      0.04571155
## 23                      Other_pct  0.02405177      0.02405177
## 24                     Population -0.01243059      0.01243059

Build simple model

lm01 <- lm(loggdp_percapita ~ loginf.mortality_per1000births, data = df03.cor)

summary(lm01)
## 
## Call:
## lm(formula = loggdp_percapita ~ loginf.mortality_per1000births, 
##     data = df03.cor)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80210 -0.13103  0.00578  0.11873  0.73945 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     5.05369    0.05729   88.21   <2e-16 ***
## loginf.mortality_per1000births -0.98475    0.03930  -25.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.24 on 176 degrees of freedom
## Multiple R-squared:  0.7811, Adjusted R-squared:  0.7798 
## F-statistic:   628 on 1 and 176 DF,  p-value: < 2.2e-16
#residuals look linear, no violation of model assumptions
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(sample=resid)) +
  geom_qq() +
  theme_minimal() +
  labs(x = "Theoretical Quantiles", 
       y = "Sample Quantilies", 
       title = "QQ Plot: Standardized residuals" )

# residuals look normally distributed
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=resid)) +
  geom_histogram(bins=10) +
  labs(x="Residuals",
       title = "Distribution of residuals") +
  theme_minimal()

#residuals look random
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=loginf.mortality_per1000births)) +
  geom_point(aes(y=resid)) +
  labs(x="Residuals",
       title = "Distribution of residuals vs. infant mortality per 1000 births") +
  theme_minimal()

See if any other variable can be added to model

#rename net migration
df03.cor <- df03.cor %>%
rename(net_migration='Net migration')

#----- random
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=Birthrate)) +
  geom_point(aes(y=resid)) +
  labs(x="Birthrate", 
       y="Residuals",
       title = "Residuals vs. Birthrate") +
  theme_minimal()

#----- there is a strong clear negative relationship 
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=Phones_per1000)) +
  geom_point(aes(y=resid)) +
  labs(x="Phones (per 1000 births)", 
       y="Residuals",
       title = "Residuals vs. Phones (per 1000 births)") +
  theme_minimal()

#----- there is some sort of negative relationship
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=logAgriculture)) +
  geom_point(aes(y=resid)) +
  labs(x="Agriculture", 
       y="Residuals",
       title = "Residuals vs. Agriculture") +
  theme_minimal()

#----- looks like there is some sort of pattern but its weak. most of the residuals are concentrated in one area
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=Literacy.pct)) +
  geom_point(aes(y=resid)) +
  labs(x="Literacy (%)", 
       y="Residuals",
       title = "Residuals vs. Literacy (%)") +
  theme_minimal()

#------ doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=Service)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Service") +
  theme_minimal()

#------ doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=Climate)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Service") +
  theme_minimal()

#------ doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=logArea_sq.mi)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. log10 Area_sq.mi") +
  theme_minimal()

#------ doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=logpop.density_sq.mi)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. log10 population density sq.mi") +
  theme_minimal()

#------doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm01, "resid") %>%
  ggplot(aes(x=net_migration)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Net Migration") +
  theme_minimal()

2nd simple model

lm02 <- lm(loggdp_percapita ~ loginf.mortality_per1000births + logAgriculture, data = df03.cor)

summary(lm02)
## 
## Call:
## lm(formula = loggdp_percapita ~ loginf.mortality_per1000births + 
##     logAgriculture, data = df03.cor)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6968 -0.1128  0.0228  0.1319  0.5280 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.30313    0.10685  40.271  < 2e-16 ***
## loginf.mortality_per1000births -0.69169    0.05014 -13.796  < 2e-16 ***
## logAgriculture                 -0.32584    0.04116  -7.917 2.68e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2065 on 175 degrees of freedom
## Multiple R-squared:  0.8388, Adjusted R-squared:  0.837 
## F-statistic: 455.4 on 2 and 175 DF,  p-value: < 2.2e-16

the remaining variables all look random with the exception of ogibes

#----- doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=Birthrate)) +
  geom_point(aes(y=resid)) +
  labs(x="Birthrate", 
       y="Residuals",
       title = "Residuals vs. Birthrate") +
  theme_minimal()

#----- looks negatively correlated
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=Phones_per1000)) +
  geom_point(aes(y=resid)) +
  labs(x="Phones per 1000", 
       y="Residuals",
       title = "Residuals vs. Phones per 1000") +
  theme_minimal()

#----- doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=Literacy.pct)) +
  geom_point(aes(y=resid)) +
  labs(x="Literacy (%)", 
       y="Residuals",
       title = "Residuals vs. Literacy (%)") +
  theme_minimal()

#------ doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=Service)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Service") +
  theme_minimal()

#------doesn't look like it adds much to model
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=Climate)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Service") +
  theme_minimal()

#------ 
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=logArea_sq.mi)) +
  geom_boxplot(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. log10 Area_sq.mi") +
  theme_minimal()

#------
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=logpop.density_sq.mi)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. log10 population density sq.mi") +
  theme_minimal()

#------
df03.cor %>%
  add_residuals(lm02, "resid") %>%
  ggplot(aes(x=net_migration)) +
  geom_point(aes(y=resid)) +
  labs(x="Service", 
       y="Residuals",
       title = "Residuals vs. Net Migration") +
  theme_minimal()

final simple model

lm03 <- lm(loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births + logAgriculture, data = df03.cor)

summary(lm03)
## 
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births + 
##     logAgriculture, data = df03.cor)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48494 -0.10865  0.00893  0.13101  0.49842 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     3.9518192  0.1221741  32.346  < 2e-16 ***
## Phones_per1000                  0.0006478  0.0001290   5.021 1.27e-06 ***
## loginf.mortality_per1000births -0.4936898  0.0613474  -8.047 1.27e-13 ***
## logAgriculture                 -0.2721211  0.0400316  -6.798 1.63e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1936 on 174 degrees of freedom
## Multiple R-squared:  0.8592, Adjusted R-squared:  0.8568 
## F-statistic:   354 on 3 and 174 DF,  p-value: < 2.2e-16
#residuals look linear, no violation of model assumptions
df03.cor %>%
  add_residuals(lm03, "resid") %>%
  ggplot(aes(sample=resid)) +
  geom_qq() +
  theme_minimal() +
  labs(x = "Theoretical Quantiles", 
       y = "Sample Quantilies", 
       title = "QQ Plot: Standardized residuals" )

# residuals look normally distributed
df03.cor %>%
  add_residuals(lm03, "resid") %>%
  ggplot(aes(x=resid)) +
  geom_histogram(bins=30) +
  labs(x="Residuals",
       title = "Distribution of residuals") +
  theme_minimal()

kitchen sink

lm03.01 <- lm(loggdp_percapita ~. , data = df03.cor)

step(lm03.01)
## Start:  AIC=-705.03
## loggdp_percapita ~ Population + Area_sq.mi + pop.density_sq.mi + 
##     Coastline + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct + 
##     Crops_pct + Other_pct + Climate + Birthrate + Deathrate + 
##     Agriculture + Industry + Service + loginf.mortality_per1000births + 
##     logBirthrate + logAgriculture + logArea_sq.mi + logpop.density_sq.mi
## 
##                                  Df Sum of Sq    RSS     AIC
## - logAgriculture                  1   0.00004 2.5890 -707.03
## - Population                      1   0.00005 2.5890 -707.03
## - Coastline                       1   0.00013 2.5891 -707.02
## - Area_sq.mi                      1   0.00104 2.5900 -706.96
## - Literacy.pct                    1   0.00511 2.5941 -706.68
## - loginf.mortality_per1000births  1   0.00526 2.5943 -706.67
## - Phones_per1000                  1   0.00675 2.5957 -706.57
## - logArea_sq.mi                   1   0.00754 2.5965 -706.51
## - Industry                        1   0.01271 2.6017 -706.16
## - net_migration                   1   0.01392 2.6029 -706.08
## - Service                         1   0.01564 2.6046 -705.96
## - pop.density_sq.mi               1   0.01774 2.6067 -705.82
## - logpop.density_sq.mi            1   0.02086 2.6098 -705.60
## - Agriculture                     1   0.02418 2.6132 -705.38
## - Climate                         1   0.02805 2.6170 -705.11
## <none>                                        2.5890 -705.03
## - Arable_pct                      1   0.09539 2.6844 -700.59
## - Other_pct                       1   0.09542 2.6844 -700.59
## - Crops_pct                       1   0.09555 2.6845 -700.58
## - logBirthrate                    1   0.11648 2.7055 -699.20
## - Deathrate                       1   0.13329 2.7223 -698.10
## - inf.mortality_per1000births     1   0.22227 2.8113 -692.37
## - Birthrate                       1   0.31080 2.8998 -686.85
## - GDP_percapita                   1   1.36502 3.9540 -631.66
## 
## Step:  AIC=-707.03
## loggdp_percapita ~ Population + Area_sq.mi + pop.density_sq.mi + 
##     Coastline + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct + 
##     Crops_pct + Other_pct + Climate + Birthrate + Deathrate + 
##     Agriculture + Industry + Service + loginf.mortality_per1000births + 
##     logBirthrate + logArea_sq.mi + logpop.density_sq.mi
## 
##                                  Df Sum of Sq    RSS     AIC
## - Population                      1   0.00006 2.5891 -709.02
## - Coastline                       1   0.00012 2.5892 -709.02
## - Area_sq.mi                      1   0.00102 2.5901 -708.96
## - Literacy.pct                    1   0.00510 2.5941 -708.68
## - loginf.mortality_per1000births  1   0.00535 2.5944 -708.66
## - Phones_per1000                  1   0.00671 2.5957 -708.57
## - logArea_sq.mi                   1   0.00783 2.5969 -708.49
## - Industry                        1   0.01268 2.6017 -708.16
## - net_migration                   1   0.01403 2.6031 -708.07
## - Service                         1   0.01563 2.6047 -707.96
## - pop.density_sq.mi               1   0.01997 2.6090 -707.66
## - logpop.density_sq.mi            1   0.02160 2.6106 -707.55
## - Agriculture                     1   0.02437 2.6134 -707.36
## <none>                                        2.5890 -707.03
## - Climate                         1   0.03067 2.6197 -706.93
## - Arable_pct                      1   0.09572 2.6848 -702.57
## - Other_pct                       1   0.09575 2.6848 -702.56
## - Crops_pct                       1   0.09588 2.6849 -702.56
## - logBirthrate                    1   0.12050 2.7095 -700.93
## - Deathrate                       1   0.13568 2.7247 -699.94
## - inf.mortality_per1000births     1   0.22457 2.8136 -694.22
## - Birthrate                       1   0.31915 2.9082 -688.34
## - GDP_percapita                   1   1.67353 4.2626 -620.28
## 
## Step:  AIC=-709.02
## loggdp_percapita ~ Area_sq.mi + pop.density_sq.mi + Coastline + 
##     net_migration + inf.mortality_per1000births + GDP_percapita + 
##     Literacy.pct + Phones_per1000 + Arable_pct + Crops_pct + 
##     Other_pct + Climate + Birthrate + Deathrate + Agriculture + 
##     Industry + Service + loginf.mortality_per1000births + logBirthrate + 
##     logArea_sq.mi + logpop.density_sq.mi
## 
##                                  Df Sum of Sq    RSS     AIC
## - Coastline                       1   0.00012 2.5892 -711.02
## - Area_sq.mi                      1   0.00189 2.5910 -710.90
## - Literacy.pct                    1   0.00505 2.5941 -710.68
## - loginf.mortality_per1000births  1   0.00537 2.5945 -710.66
## - Phones_per1000                  1   0.00673 2.5958 -710.56
## - logArea_sq.mi                   1   0.00804 2.5971 -710.47
## - Industry                        1   0.01263 2.6017 -710.16
## - net_migration                   1   0.01398 2.6031 -710.07
## - Service                         1   0.01557 2.6047 -709.96
## - pop.density_sq.mi               1   0.02018 2.6093 -709.64
## - logpop.density_sq.mi            1   0.02382 2.6129 -709.40
## - Agriculture                     1   0.02431 2.6134 -709.36
## <none>                                        2.5891 -709.02
## - Climate                         1   0.03078 2.6199 -708.92
## - Arable_pct                      1   0.09596 2.6851 -704.55
## - Other_pct                       1   0.09600 2.6851 -704.54
## - Crops_pct                       1   0.09613 2.6852 -704.54
## - logBirthrate                    1   0.12069 2.7098 -702.92
## - Deathrate                       1   0.13576 2.7248 -701.93
## - inf.mortality_per1000births     1   0.22539 2.8145 -696.17
## - Birthrate                       1   0.32211 2.9112 -690.15
## - GDP_percapita                   1   1.68471 4.2738 -621.81
## 
## Step:  AIC=-711.02
## loggdp_percapita ~ Area_sq.mi + pop.density_sq.mi + net_migration + 
##     inf.mortality_per1000births + GDP_percapita + Literacy.pct + 
##     Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Climate + 
##     Birthrate + Deathrate + Agriculture + Industry + Service + 
##     loginf.mortality_per1000births + logBirthrate + logArea_sq.mi + 
##     logpop.density_sq.mi
## 
##                                  Df Sum of Sq    RSS     AIC
## - Area_sq.mi                      1   0.00183 2.5910 -712.89
## - Literacy.pct                    1   0.00504 2.5942 -712.67
## - loginf.mortality_per1000births  1   0.00558 2.5948 -712.63
## - Phones_per1000                  1   0.00670 2.5959 -712.56
## - logArea_sq.mi                   1   0.00860 2.5978 -712.43
## - Industry                        1   0.01261 2.6018 -712.15
## - net_migration                   1   0.01401 2.6032 -712.06
## - Service                         1   0.01553 2.6047 -711.95
## - pop.density_sq.mi               1   0.02052 2.6097 -711.61
## - logpop.density_sq.mi            1   0.02385 2.6131 -711.38
## - Agriculture                     1   0.02427 2.6135 -711.36
## <none>                                        2.5892 -711.02
## - Climate                         1   0.03094 2.6202 -710.90
## - Arable_pct                      1   0.09600 2.6852 -706.54
## - Other_pct                       1   0.09603 2.6852 -706.53
## - Crops_pct                       1   0.09616 2.6854 -706.53
## - logBirthrate                    1   0.12057 2.7098 -704.92
## - Deathrate                       1   0.13565 2.7249 -703.93
## - inf.mortality_per1000births     1   0.22534 2.8146 -698.16
## - Birthrate                       1   0.32203 2.9112 -692.15
## - GDP_percapita                   1   1.71889 4.3081 -622.39
## 
## Step:  AIC=-712.89
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct + 
##     Crops_pct + Other_pct + Climate + Birthrate + Deathrate + 
##     Agriculture + Industry + Service + loginf.mortality_per1000births + 
##     logBirthrate + logArea_sq.mi + logpop.density_sq.mi
## 
##                                  Df Sum of Sq    RSS     AIC
## - loginf.mortality_per1000births  1   0.00460 2.5956 -714.57
## - Literacy.pct                    1   0.00486 2.5959 -714.56
## - Phones_per1000                  1   0.00508 2.5961 -714.54
## - Industry                        1   0.01248 2.6035 -714.04
## - net_migration                   1   0.01374 2.6048 -713.95
## - Service                         1   0.01539 2.6064 -713.84
## - pop.density_sq.mi               1   0.01904 2.6101 -713.59
## - logArea_sq.mi                   1   0.02074 2.6118 -713.47
## - Agriculture                     1   0.02410 2.6151 -713.24
## - logpop.density_sq.mi            1   0.02428 2.6153 -713.23
## <none>                                        2.5910 -712.89
## - Climate                         1   0.02967 2.6207 -712.86
## - Arable_pct                      1   0.09741 2.6884 -708.32
## - Other_pct                       1   0.09745 2.6885 -708.32
## - Crops_pct                       1   0.09758 2.6886 -708.31
## - logBirthrate                    1   0.11955 2.7106 -706.86
## - Deathrate                       1   0.13668 2.7277 -705.74
## - inf.mortality_per1000births     1   0.22351 2.8146 -700.16
## - Birthrate                       1   0.32037 2.9114 -694.14
## - GDP_percapita                   1   1.72813 4.3192 -623.93
## 
## Step:  AIC=-714.57
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct + 
##     Crops_pct + Other_pct + Climate + Birthrate + Deathrate + 
##     Agriculture + Industry + Service + logBirthrate + logArea_sq.mi + 
##     logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - Literacy.pct                 1   0.00464 2.6003 -716.26
## - Phones_per1000               1   0.00492 2.6006 -716.24
## - net_migration                1   0.01244 2.6081 -715.72
## - Industry                     1   0.01273 2.6084 -715.70
## - Service                      1   0.01582 2.6115 -715.49
## - pop.density_sq.mi            1   0.01921 2.6149 -715.26
## - logArea_sq.mi                1   0.02170 2.6173 -715.09
## - Agriculture                  1   0.02461 2.6203 -714.89
## - logpop.density_sq.mi         1   0.02584 2.6215 -714.81
## <none>                                     2.5956 -714.57
## - Climate                      1   0.03395 2.6296 -714.26
## - Arable_pct                   1   0.09849 2.6941 -709.95
## - Other_pct                    1   0.09854 2.6942 -709.94
## - Crops_pct                    1   0.09868 2.6943 -709.93
## - Deathrate                    1   0.13581 2.7315 -707.50
## - logBirthrate                 1   0.17332 2.7690 -705.07
## - inf.mortality_per1000births  1   0.34118 2.9368 -694.59
## - Birthrate                    1   0.41478 3.0104 -690.19
## - GDP_percapita                1   2.01834 4.6140 -614.18
## 
## Step:  AIC=-716.26
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Phones_per1000 + Arable_pct + Crops_pct + 
##     Other_pct + Climate + Birthrate + Deathrate + Agriculture + 
##     Industry + Service + logBirthrate + logArea_sq.mi + logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - Phones_per1000               1   0.00455 2.6048 -717.95
## - Industry                     1   0.01141 2.6117 -717.48
## - net_migration                1   0.01279 2.6131 -717.38
## - Service                      1   0.01434 2.6146 -717.28
## - pop.density_sq.mi            1   0.01868 2.6190 -716.98
## - logArea_sq.mi                1   0.02127 2.6215 -716.81
## - Agriculture                  1   0.02272 2.6230 -716.71
## - logpop.density_sq.mi         1   0.02392 2.6242 -716.63
## <none>                                     2.6003 -716.26
## - Climate                      1   0.03740 2.6377 -715.72
## - Arable_pct                   1   0.09752 2.6978 -711.70
## - Other_pct                    1   0.09757 2.6978 -711.70
## - Crops_pct                    1   0.09772 2.6980 -711.69
## - Deathrate                    1   0.13284 2.7331 -709.39
## - logBirthrate                 1   0.16989 2.7702 -706.99
## - inf.mortality_per1000births  1   0.33662 2.9369 -696.59
## - Birthrate                    1   0.41679 3.0171 -691.79
## - GDP_percapita                1   2.02332 4.6236 -615.81
## 
## Step:  AIC=-717.95
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate + 
##     Birthrate + Deathrate + Agriculture + Industry + Service + 
##     logBirthrate + logArea_sq.mi + logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - Industry                     1    0.0104 2.6153 -719.23
## - net_migration                1    0.0112 2.6160 -719.18
## - Service                      1    0.0130 2.6178 -719.06
## - pop.density_sq.mi            1    0.0156 2.6204 -718.88
## - Agriculture                  1    0.0213 2.6261 -718.50
## - logpop.density_sq.mi         1    0.0246 2.6295 -718.27
## <none>                                     2.6048 -717.95
## - logArea_sq.mi                1    0.0302 2.6350 -717.89
## - Climate                      1    0.0375 2.6423 -717.40
## - Arable_pct                   1    0.0937 2.6985 -713.66
## - Other_pct                    1    0.0937 2.6986 -713.65
## - Crops_pct                    1    0.0939 2.6987 -713.64
## - Deathrate                    1    0.1290 2.7338 -711.35
## - logBirthrate                 1    0.1656 2.7705 -708.97
## - inf.mortality_per1000births  1    0.3342 2.9390 -698.46
## - Birthrate                    1    0.4122 3.0171 -693.79
## - GDP_percapita                1    4.2706 6.8754 -547.18
## 
## Step:  AIC=-719.23
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births + 
##     GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate + 
##     Birthrate + Deathrate + Agriculture + Service + logBirthrate + 
##     logArea_sq.mi + logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - net_migration                1    0.0119 2.6272 -720.43
## - pop.density_sq.mi            1    0.0164 2.6317 -720.12
## - logpop.density_sq.mi         1    0.0281 2.6434 -719.33
## <none>                                     2.6153 -719.23
## - logArea_sq.mi                1    0.0338 2.6490 -718.95
## - Climate                      1    0.0362 2.6515 -718.78
## - Service                      1    0.0389 2.6542 -718.60
## - Arable_pct                   1    0.0911 2.7064 -715.14
## - Other_pct                    1    0.0912 2.7064 -715.13
## - Crops_pct                    1    0.0913 2.7066 -715.12
## - Deathrate                    1    0.1274 2.7427 -712.76
## - logBirthrate                 1    0.1690 2.7843 -710.09
## - Agriculture                  1    0.2994 2.9147 -701.94
## - inf.mortality_per1000births  1    0.3487 2.9640 -698.96
## - Birthrate                    1    0.4112 3.0265 -695.24
## - GDP_percapita                1    4.2604 6.8756 -549.18
## 
## Step:  AIC=-720.43
## loggdp_percapita ~ pop.density_sq.mi + inf.mortality_per1000births + 
##     GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate + 
##     Birthrate + Deathrate + Agriculture + Service + logBirthrate + 
##     logArea_sq.mi + logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - pop.density_sq.mi            1    0.0169 2.6440 -721.29
## - logpop.density_sq.mi         1    0.0279 2.6551 -720.55
## <none>                                     2.6272 -720.43
## - logArea_sq.mi                1    0.0350 2.6622 -720.07
## - Service                      1    0.0389 2.6660 -719.81
## - Climate                      1    0.0438 2.6709 -719.49
## - Arable_pct                   1    0.0876 2.7147 -716.59
## - Other_pct                    1    0.0876 2.7148 -716.59
## - Crops_pct                    1    0.0878 2.7150 -716.57
## - Deathrate                    1    0.1213 2.7484 -714.39
## - logBirthrate                 1    0.1636 2.7907 -711.67
## - Agriculture                  1    0.2973 2.9245 -703.34
## - inf.mortality_per1000births  1    0.3400 2.9671 -700.77
## - Birthrate                    1    0.4011 3.0283 -697.13
## - GDP_percapita                1    5.2077 7.8349 -527.93
## 
## Step:  AIC=-721.29
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita + 
##     Arable_pct + Crops_pct + Other_pct + Climate + Birthrate + 
##     Deathrate + Agriculture + Service + logBirthrate + logArea_sq.mi + 
##     logpop.density_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - logpop.density_sq.mi         1    0.0156 2.6596 -722.24
## <none>                                     2.6440 -721.29
## - Service                      1    0.0327 2.6767 -721.10
## - logArea_sq.mi                1    0.0355 2.6795 -720.91
## - Climate                      1    0.0509 2.6949 -719.89
## - Arable_pct                   1    0.0884 2.7325 -717.43
## - Other_pct                    1    0.0885 2.7325 -717.43
## - Crops_pct                    1    0.0887 2.7327 -717.42
## - Deathrate                    1    0.1197 2.7637 -715.41
## - logBirthrate                 1    0.1469 2.7909 -713.66
## - Agriculture                  1    0.2895 2.9335 -704.79
## - inf.mortality_per1000births  1    0.3396 2.9837 -701.78
## - Birthrate                    1    0.3849 3.0289 -699.10
## - GDP_percapita                1    5.2272 7.8712 -529.11
## 
## Step:  AIC=-722.24
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita + 
##     Arable_pct + Crops_pct + Other_pct + Climate + Birthrate + 
##     Deathrate + Agriculture + Service + logBirthrate + logArea_sq.mi
## 
##                               Df Sum of Sq    RSS     AIC
## - logArea_sq.mi                1    0.0206 2.6803 -722.86
## <none>                                     2.6596 -722.24
## - Service                      1    0.0318 2.6914 -722.12
## - Climate                      1    0.0459 2.7055 -721.19
## - Other_pct                    1    0.0927 2.7523 -718.14
## - Arable_pct                   1    0.0927 2.7523 -718.14
## - Crops_pct                    1    0.0929 2.7525 -718.13
## - Deathrate                    1    0.1589 2.8186 -713.91
## - logBirthrate                 1    0.1756 2.8352 -712.86
## - Agriculture                  1    0.2795 2.9392 -706.45
## - inf.mortality_per1000births  1    0.3828 3.0425 -700.30
## - Birthrate                    1    0.4191 3.0788 -698.19
## - GDP_percapita                1    5.2120 7.8716 -531.10
## 
## Step:  AIC=-722.86
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita + 
##     Arable_pct + Crops_pct + Other_pct + Climate + Birthrate + 
##     Deathrate + Agriculture + Service + logBirthrate
## 
##                               Df Sum of Sq    RSS     AIC
## - Service                      1    0.0192 2.6995 -723.59
## <none>                                     2.6803 -722.86
## - Climate                      1    0.0483 2.7286 -721.68
## - Other_pct                    1    0.0929 2.7732 -718.79
## - Arable_pct                   1    0.0930 2.7733 -718.79
## - Crops_pct                    1    0.0931 2.7734 -718.79
## - Deathrate                    1    0.1529 2.8332 -714.99
## - logBirthrate                 1    0.1720 2.8523 -713.79
## - Agriculture                  1    0.2813 2.9616 -707.10
## - inf.mortality_per1000births  1    0.3707 3.0510 -701.80
## - Birthrate                    1    0.4201 3.1003 -698.95
## - GDP_percapita                1    5.2660 7.9463 -531.42
## 
## Step:  AIC=-723.59
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita + 
##     Arable_pct + Crops_pct + Other_pct + Climate + Birthrate + 
##     Deathrate + Agriculture + logBirthrate
## 
##                               Df Sum of Sq    RSS     AIC
## <none>                                     2.6995 -723.59
## - Climate                      1    0.0488 2.7483 -722.40
## - Other_pct                    1    0.0972 2.7967 -719.29
## - Arable_pct                   1    0.0972 2.7968 -719.29
## - Crops_pct                    1    0.0974 2.7969 -719.28
## - Deathrate                    1    0.1567 2.8563 -715.54
## - logBirthrate                 1    0.1809 2.8804 -714.04
## - Agriculture                  1    0.2621 2.9616 -709.10
## - inf.mortality_per1000births  1    0.3547 3.0542 -703.62
## - Birthrate                    1    0.4401 3.1396 -698.71
## - GDP_percapita                1    5.2866 7.9861 -532.53
## 
## Call:
## lm(formula = loggdp_percapita ~ inf.mortality_per1000births + 
##     GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate + 
##     Birthrate + Deathrate + Agriculture + logBirthrate, data = df03.cor)
## 
## Coefficients:
##                 (Intercept)  inf.mortality_per1000births  
##                   2.716e+02                   -3.870e-03  
##               GDP_percapita                   Arable_pct  
##                   2.966e-05                   -2.686e+00  
##                   Crops_pct                    Other_pct  
##                  -2.687e+00                   -2.685e+00  
##                     Climate                    Birthrate  
##                  -3.171e-02                   -2.578e-02  
##                   Deathrate                  Agriculture  
##                   9.141e-03                   -4.289e-01  
##                logBirthrate  
##                   8.670e-01

kitchen sink model loggdp_percapita ~ net_migration + Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate + Service + loginf.mortality_per1000births + logBirthrate + logAgriculture + logpop.density_sq.mi, data = df03.cor

model fit..of kitchen sink model

lm04 <- lm(loggdp_percapita ~ net_migration + 
    Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + 
    Deathrate + Service + loginf.mortality_per1000births  + 
    logAgriculture + logpop.density_sq.mi, data = df03.cor)

summary(lm04)
## 
## Call:
## lm(formula = loggdp_percapita ~ net_migration + Phones_per1000 + 
##     Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate + 
##     Service + loginf.mortality_per1000births + logAgriculture + 
##     logpop.density_sq.mi, data = df03.cor)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48015 -0.11478  0.01303  0.11016  0.45185 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.012e+02  1.454e+02   2.758 0.006466 ** 
## net_migration                   5.863e-03  3.201e-03   1.831 0.068824 .  
## Phones_per1000                  6.294e-04  1.220e-04   5.160 6.97e-07 ***
## Arable_pct                     -3.969e+00  1.455e+00  -2.729 0.007044 ** 
## Crops_pct                      -3.970e+00  1.454e+00  -2.730 0.007027 ** 
## Other_pct                      -3.970e+00  1.455e+00  -2.729 0.007032 ** 
## Birthrate                      -1.319e-02  2.312e-03  -5.702 5.30e-08 ***
## Deathrate                      -4.639e-04  3.093e-03  -0.150 0.880966    
## Service                        -9.818e-02  1.063e-01  -0.923 0.357124    
## loginf.mortality_per1000births -2.782e-01  7.191e-02  -3.869 0.000157 ***
## logAgriculture                 -2.671e-01  4.102e-02  -6.512 8.46e-10 ***
## logpop.density_sq.mi           -9.388e-02  2.710e-02  -3.464 0.000678 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1688 on 166 degrees of freedom
## Multiple R-squared:  0.8979, Adjusted R-squared:  0.8911 
## F-statistic: 132.7 on 11 and 166 DF,  p-value: < 2.2e-16
extractAIC(lm04)
## [1]   12.0000 -621.7977
df03.cor %>%
  add_residuals(lm04, "resid") %>%
  ggplot(aes(sample=resid)) +
  geom_qq() +
  theme_minimal() +
  labs(x = "Theoretical Quantiles", 
       y = "Sample Quantilies", 
       title = "QQ Plot: Standardized residuals")

df03.cor %>%
  add_residuals(lm04, "resid") %>%
  ggplot(aes(x=resid)) +
  geom_histogram(bins=30) +
  labs(x="Residuals",
       title = "Distribution of residuals") +
  theme_minimal()

5-k cross fold modeling

set.seed(2)

df04.cv <- crossv_kfold(df03.cor, k = 5)
df04.cv
## # A tibble: 5 × 3
##   train                 test                 .id  
##   <named list>          <named list>         <chr>
## 1 <resample [142 x 24]> <resample [36 x 24]> 1    
## 2 <resample [142 x 24]> <resample [36 x 24]> 2    
## 3 <resample [142 x 24]> <resample [36 x 24]> 3    
## 4 <resample [143 x 24]> <resample [35 x 24]> 4    
## 5 <resample [143 x 24]> <resample [35 x 24]> 5
cv_qda1 <- df04.cv %>%
  mutate(
    fit = purrr::map(train,
                     ~lm(loggdp_percapita ~ net_migration + loginf.mortality_per1000births + Phones_per1000 + Arable_pct + 
                           Crops_pct + Other_pct + Birthrate +  
                           Deathrate + Service + 
                           loginf.mortality_per1000births + logAgriculture +
                           logpop.density_sq.mi,
                         data = .)),
rmse = purrr::map2_dbl(fit, test, ~rmse(.x,.y)))

cv_qda1
## # A tibble: 5 × 5
##   train                 test                 .id   fit           rmse
##   <named list>          <named list>         <chr> <named list> <dbl>
## 1 <resample [142 x 24]> <resample [36 x 24]> 1     <lm>         0.201
## 2 <resample [142 x 24]> <resample [36 x 24]> 2     <lm>         0.146
## 3 <resample [142 x 24]> <resample [36 x 24]> 3     <lm>         0.193
## 4 <resample [143 x 24]> <resample [35 x 24]> 4     <lm>         0.190
## 5 <resample [143 x 24]> <resample [35 x 24]> 5     <lm>         0.152
mean(cv_qda1$rmse)
## [1] 0.1765826

build rmse function to compare rmse of models

do_dataframe_cv1 <- function(formula) {
  df04.cv %>%
    mutate(fit = map(train, ~lm(formula, data = .)),
           rmse = map2_dbl(fit, test, ~rmse(.x,.y))) %>%
    summarize(cv_rmse = mean(rmse)) %>%
    pull(cv_rmse)
}

RMSE of simple model: lm(log10gdp_percapita ~ Phones (per 1000), data = df03)

do_dataframe_cv1(loggdp_percapita ~ loginf.mortality_per1000births)
## [1] 0.2422357

RMSE of final simple model: lm(log10gdp_percapita ~ Phones (per 1000) + logInfant mortality (per 1000 births) + logAgriculture

do_dataframe_cv1(loggdp_percapita ~ loginf.mortality_per1000births + Phones_per1000 + logAgriculture)
## [1] 0.1957149

RMSE of kitchen sink model: loggdp_percapita ~ net_migration + loginf.mortality_per1000births + Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate + Service + loginf.mortality_per1000births + logAgriculture + logpop.density_sq.mi

do_dataframe_cv1(loggdp_percapita ~ net_migration + loginf.mortality_per1000births + 
    Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + 
    Deathrate + Service + loginf.mortality_per1000births + 
    logAgriculture + logpop.density_sq.mi)
## [1] 0.1765826

greedy selection, start with creating a partion of dataset

df05.cv <- resample_partition(df03.cor,
                                    p=c(train=0.6,
                                        valid=0.20,
                                        test=0.20))

df05.cv
## $train
## <resample [106 x 24]> 1, 3, 5, 6, 7, 11, 14, 15, 16, 17, ...
## 
## $valid
## <resample [36 x 24]> 2, 4, 8, 12, 18, 22, 25, 28, 31, 38, ...
## 
## $test
## <resample [36 x 24]> 9, 10, 13, 21, 23, 24, 27, 36, 45, 50, ...

In forward selection, we begin with an empty model (no candidate variables), and at each step, we add the variable that improves the model the most.

step1 <- function(response, predictors, candidates, partition) {

rhs <- paste0(paste0(predictors, collapse = "+"), "+", candidates)
formulas <- lapply(paste0(response, "~", rhs), as.formula)
rmses <- sapply(formulas,
                function(fm) rmse(lm(fm, data = partition$train),
                                  data = partition$valid))
names(rmses) <- candidates
attr(rmses, "best") <- rmses[which.min(rmses)]
rmses
}

initalize a variable for tracking out model

model <- NULL

Step 1 (no variables): the best one, and the first one to be added as a predictor is ‘log10(infant mortality rate (per 1,000 births))’; RMSE: 0.2654632

preds <- "1"
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct", "Phones_per1000","Arable_pct",
           "Crops_pct", "Other_pct","Climate","Birthrate",
           "Deathrate","logAgriculture","Industry","Service",
           "logpop.density_sq.mi")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.4906678                      0.4555911 
##                      Coastline                  net_migration 
##                      0.7665041                      0.4643247 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.2775538                      0.3649646 
##                 Phones_per1000                     Arable_pct 
##                      0.2351127                      0.4826919 
##                      Crops_pct                      Other_pct 
##                      0.4770081                      0.4827703 
##                        Climate                      Birthrate 
##                      0.4688727                      0.2870815 
##                      Deathrate                 logAgriculture 
##                      0.4164659                      0.2434491 
##                       Industry                        Service 
##                      0.4679956                      0.4197100 
##           logpop.density_sq.mi 
##                      0.4604890 
## attr(,"best")
## Phones_per1000 
##      0.2351127

step 2 ()

preds <- "Phones_per1000"
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct","Arable_pct",
           "Crops_pct", "Other_pct","Climate","Birthrate",
           "Deathrate","logAgriculture","Industry","Service",
           "logpop.density_sq.mi")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.2392479                      0.2437711 
##                      Coastline                  net_migration 
##                      0.2485474                      0.2356203 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.2384132                      0.2325620 
##                     Arable_pct                      Crops_pct 
##                      0.2392499                      0.2335947 
##                      Other_pct                        Climate 
##                      0.2361013                      0.2356103 
##                      Birthrate                      Deathrate 
##                      0.2076721                      0.2251351 
##                 logAgriculture                       Industry 
##                      0.1850448                      0.2268648 
##                        Service           logpop.density_sq.mi 
##                      0.2389994                      0.2343216 
## attr(,"best")
## logAgriculture 
##      0.1850448
preds <- c("logAgriculture","Phones_per1000")
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct","Arable_pct",
           "Crops_pct", "Other_pct","Climate","Birthrate",
           "Deathrate","Industry","Service",
           "logpop.density_sq.mi")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.1870160                      0.1905393 
##                      Coastline                  net_migration 
##                      0.2901266                      0.1823765 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.1976153                      0.1856909 
##                     Arable_pct                      Crops_pct 
##                      0.1851106                      0.1891013 
##                      Other_pct                        Climate 
##                      0.1846467                      0.1785219 
##                      Birthrate                      Deathrate 
##                      0.1676147                      0.1823447 
##                       Industry                        Service 
##                      0.2011685                      0.1866038 
##           logpop.density_sq.mi 
##                      0.1847536 
## attr(,"best")
## Birthrate 
## 0.1676147
preds <- c("Birthrate","Phones_per1000","logAgriculture")
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct","Arable_pct",
           "Crops_pct", "Other_pct","Climate",
           "Deathrate","Industry","Service",
           "logpop.density_sq.mi")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.1666394                      0.1636918 
##                      Coastline                  net_migration 
##                      0.1909273                      0.1709369 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.1810549                      0.1726926 
##                     Arable_pct                      Crops_pct 
##                      0.1674778                      0.1691904 
##                      Other_pct                        Climate 
##                      0.1653895                      0.1704070 
##                      Deathrate                       Industry 
##                      0.1710802                      0.1766612 
##                        Service           logpop.density_sq.mi 
##                      0.1689362                      0.1531163 
## attr(,"best")
## logpop.density_sq.mi 
##            0.1531163
preds <- c("Birthrate","Phones_per1000","logAgriculture","logpop.density_sq.mi")
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct","Arable_pct",
           "Crops_pct", "Other_pct","Climate",
           "Deathrate","Industry","Service")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.1563595                      0.1523853 
##                      Coastline                  net_migration 
##                      0.1533584                      0.1577634 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.1609320                      0.1556995 
##                     Arable_pct                      Crops_pct 
##                      0.1515691                      0.1610162 
##                      Other_pct                        Climate 
##                      0.1527507                      0.1567245 
##                      Deathrate                       Industry 
##                      0.1567965                      0.1606570 
##                        Service 
##                      0.1536528 
## attr(,"best")
## Arable_pct 
##  0.1515691
preds <- c("Arable_pct","Phones_per1000","logAgriculture","logpop.density_sq.mi","Crops_pct")
cands <- c("Population", 
           "logArea_sq.mi", 
           "Coastline", 
           "net_migration","loginf.mortality_per1000births", 
           "Literacy.pct",
           "Crops_pct", "Other_pct","Climate",
           "Deathrate","Industry","Service")

s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))

s1
##                     Population                  logArea_sq.mi 
##                      0.1907350                      0.1907232 
##                      Coastline                  net_migration 
##                      0.1923280                      0.1821928 
## loginf.mortality_per1000births                   Literacy.pct 
##                      0.1769431                      0.1798873 
##                      Crops_pct                      Other_pct 
##                      0.1858573                      0.1851522 
##                        Climate                      Deathrate 
##                      0.1811446                      0.1734087 
##                       Industry                        Service 
##                      0.1924764                      0.1857436 
## attr(,"best")
## Deathrate 
## 0.1734087

cut off model at net migration

step_model <- tibble(index = seq_along(model),
                     variable = factor(names(model), levels = names(model)),
                     RMSE=model)


ggplot(step_model, aes(y = RMSE)) +
  geom_point(aes(x=variable)) +
  geom_line(aes(x=index)) +
  labs(title = "Stepwise Model Selection Plot", 
       x = "Predictor Variables") +
  theme_classic() +
  theme(axis.text.x = element_text(color = "black", 
                                   size = 9, angle=30,
                                   vjust=.8, hjust=0.8))

set.seed(4)
modelstepwise <- lm(loggdp_percapita ~  Phones_per1000 + logAgriculture + 
                      Birthrate + logpop.density_sq.mi + Arable_pct,
                    data=df05.cv$valid)

rmse(modelstepwise, df05.cv$valid)
## [1] 0.1430894
modelstepwisefinal <- lm(loggdp_percapita ~  Phones_per1000 + logAgriculture +
                           Birthrate + logpop.density_sq.mi + Arable_pct,
                         data=df05.cv$test)

rmse(modelstepwise, df05.cv$test)
## [1] 0.215647
summary(modelstepwise)
## 
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + logAgriculture + 
##     Birthrate + logpop.density_sq.mi + Arable_pct, data = df05.cv$valid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26546 -0.06345 -0.00269  0.10029  0.34787 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.7730597  0.1956711  19.283  < 2e-16 ***
## Phones_per1000        0.0009994  0.0002570   3.888 0.000519 ***
## logAgriculture       -0.3634616  0.0786657  -4.620 6.79e-05 ***
## Birthrate            -0.0148046  0.0038180  -3.878 0.000534 ***
## logpop.density_sq.mi -0.2071066  0.0727134  -2.848 0.007864 ** 
## Arable_pct            0.0036985  0.0026212   1.411 0.168533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1567 on 30 degrees of freedom
## Multiple R-squared:  0.9117, Adjusted R-squared:  0.8969 
## F-statistic: 61.92 on 5 and 30 DF,  p-value: 6.738e-15
summary(modelstepwisefinal)
## 
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + logAgriculture + 
##     Birthrate + logpop.density_sq.mi + Arable_pct, data = df05.cv$test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47776 -0.13738  0.00086  0.14894  0.26388 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.892e+00  2.357e-01  16.515  < 2e-16 ***
## Phones_per1000        9.018e-04  2.278e-04   3.958 0.000428 ***
## logAgriculture       -1.844e-01  1.056e-01  -1.747 0.090942 .  
## Birthrate            -1.775e-02  5.412e-03  -3.280 0.002635 ** 
## logpop.density_sq.mi -9.355e-02  6.693e-02  -1.398 0.172420    
## Arable_pct           -8.006e-05  3.505e-03  -0.023 0.981925    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2069 on 30 degrees of freedom
## Multiple R-squared:  0.853,  Adjusted R-squared:  0.8285 
## F-statistic: 34.83 on 5 and 30 DF,  p-value: 1.27e-11
df03.cor %>%
  add_residuals(modelstepwise, "resid") %>%
  ggplot(aes(sample=resid)) +
  geom_qq() +
  theme_minimal() +
  labs(x = "Theoretical Quantiles", 
       y = "Sample Quantilies", 
       title = "QQ Plot: Standardized residuals")

df03.cor %>%
  add_residuals(modelstepwise, "resid") %>%
  ggplot(aes(x=resid)) +
  geom_histogram(bins=30) +
  labs(x="Residuals",
       title = "Distribution of residuals") +
  theme_minimal()

best predictive model for gdp per capita

do_dataframe_cv1(loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births + logAgriculture)
## [1] 0.1957149
#vs. 

do_dataframe_cv1(loggdp_percapita ~ Phones_per1000 + logAgriculture + Birthrate + logpop.density_sq.mi)
## [1] 0.183917
#remaining things to do: add notations to code chunks, do simple visualization for presentation,kint
lm5<- randomForest(loggdp_percapita ~., data = df03.cor, proximity = TRUE)

summary(lm5)
##                 Length Class  Mode     
## call                4  -none- call     
## type                1  -none- character
## predicted         178  -none- numeric  
## mse               500  -none- numeric  
## rsq               500  -none- numeric  
## oob.times         178  -none- numeric  
## importance         23  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity       31684  -none- numeric  
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             11  -none- list     
## coefs               0  -none- NULL     
## y                 178  -none- numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
plot(lm5)

print(lm5)
## 
## Call:
##  randomForest(formula = loggdp_percapita ~ ., data = df03.cor,      proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##           Mean of squared residuals: 0.007785018
##                     % Var explained: 97.01
rmse(lm5, df05.cv$valid)
## [1] 0.04277197
mae(lm5,df05.cv$valid)
## [1] 0.03305515
varImpPlot(lm5)

importance(lm5)
##                                IncNodePurity
## Population                        0.08024075
## Area_sq.mi                        0.06820220
## pop.density_sq.mi                 0.07594269
## Coastline                         0.06389513
## net_migration                     0.74968611
## inf.mortality_per1000births       4.27324369
## GDP_percapita                    16.45243249
## Literacy.pct                      0.28473874
## Phones_per1000                    7.90083221
## Arable_pct                        0.10792763
## Crops_pct                         0.16305132
## Other_pct                         0.07920452
## Climate                           0.05784808
## Birthrate                         2.57563623
## Deathrate                         0.19325727
## Agriculture                       2.60165108
## Industry                          0.15777569
## Service                           0.23263785
## loginf.mortality_per1000births    3.72387853
## logBirthrate                      2.46254464
## logAgriculture                    3.24172447
## logArea_sq.mi                     0.07456106
## logpop.density_sq.mi              0.08767843

library(vip)

vip(modelstepwise, aesthetics = list(fill = "purple")) + theme_classic()

finaldata <- df05.cv$test$data
finaldata$gdp_percapita <- df03.cor$GDP_percapita

pred_test <- data.frame(10^(predict(modelstepwise, newdata = finaldata)), finaldata$GDP_percapita)

colnames(pred_test) <- c('Predicted_Values', 'Actual_Values')

pred_test <- mutate(pred_test, Absolute_Difference = abs(Predicted_Values - Actual_Values))

pred_test
##     Predicted_Values Actual_Values Absolute_Difference
## 1           862.5046           700           162.50461
## 2          3127.1417          4500          1372.85833
## 3          5433.5860          6000           566.41402
## 4         12355.3486          8600          3755.34862
## 5         15893.5304         11000          4893.53040
## 6          8372.0661         11200          2827.93392
## 7          4641.6092          3500          1141.60922
## 8         31883.9313         28000          3883.93133
## 9         41018.3656         29000         12018.36558
## 10        23997.5887         30000          6002.41130
## 11         3786.5588          3400           386.55876
## 12        17887.0179         16700          1187.01790
## 13        10272.7891         16900          6627.21087
## 14         1586.1098          1900           313.89016
## 15        11633.9327         15700          4066.06728
## 16        11476.7149          6100          5376.71489
## 17        23467.0068         29100          5632.99318
## 18         3574.3810          4900          1325.61899
## 19         1186.9527          1100            86.95272
## 20        41242.5121         36000          5242.51214
## 21         1463.2861          1300           163.28609
## 22         4416.5479          2400          2016.54787
## 23        10318.5481          9000          1318.54812
## 24         7786.7524          7600           186.75236
## 25        20685.0127         16000          4685.01271
## 26         7624.5249         18600         10975.47512
## 27        12929.3948          7600          5329.39476
## 28          963.3744          1100           136.62557
## 29         1917.3856          1800           117.38565
## 30          780.5352           600           180.53524
## 31         1699.3978          1900           200.60223
## 32         1341.5023          1800           458.49768
## 33         3353.1865          1400          1953.18645
## 34        44115.4844         35000          9115.48439
## 35         1604.8543          1100           504.85433
## 36         1250.1169          1200            50.11688
## 37         8694.4323          9900          1205.56768
## 38         6111.8958          5000          1111.89575
## 39         4521.2273          6300          1778.77272
## 40         1022.6582           700           322.65819
## 41          863.6166           700           163.61661
## 42         2362.1134           700          1662.11344
## 43         7052.9992          9100          2047.00084
## 44         1397.3649          1400             2.63512
## 45         6841.4175          2900          3941.41748
## 46        15736.8864         15700            36.88638
## 47        41793.2699         31100         10693.26988
## 48         1612.9964          1300           312.99642
## 49         5539.7841          5400           139.78409
## 50         3053.1773          6000          2946.82265
## 51         4593.7742          3300          1293.77422
## 52         3044.4681          4000           955.53189
## 53         3049.5680          4800          1750.43202
## 54         3707.1201          2700          1007.12005
## 55         2092.6165           700          1392.61650
## 56        16659.0837         12300          4359.08366
## 57          997.3513           700           297.35134
## 58         4197.1992          5800          1602.80076
## 59        23328.1811         27400          4071.81888
## 60        30496.2278         27600          2896.22778
## 61        12141.1103          8300          3841.11032
## 62         7855.3751         17500          9644.62486
## 63         3641.2615          5500          1858.73847
## 64         1115.6340          1700           584.36599
## 65         5099.9711          2500          2599.97109
## 66        49854.2439         27600         22254.24390
## 67         1400.0569          2200           799.94309
## 68        23046.9463         20000          3046.94626
## 69         6201.5705          5000          1201.57048
## 70         7188.0734          8000           811.92663
## 71         1896.9535          4100          2203.04650
## 72         1168.8516          2100           931.14844
## 73         1029.8544           800           229.85443
## 74         4972.7897          4000           972.78973
## 75         1104.6541          1600           495.34590
## 76         2472.9858          2600           127.01423
## 77        34112.8551         28800          5312.85505
## 78        17794.0989         13900          3894.09887
## 79        32375.0086         30900          1475.00861
## 80         2734.1284          2900           165.87157
## 81         2796.9692          3200           403.03076
## 82         6829.1101          7000           170.88994
## 83         2690.0748          1500          1190.07485
## 84        16731.1954         29600         12868.80463
## 85        12331.5133         19800          7468.48674
## 86         4239.3722          3900           339.37225
## 87        18193.8887         28200         10006.11127
## 88         5459.6771          4300          1159.67709
## 89        10032.8990          6300          3732.89905
## 90         1385.7132          1000           385.71316
## 91         2395.5542          1300          1095.55418
## 92        14275.2187         17800          3524.78125
## 93        12351.2488         19000          6648.75115
## 94         2610.6509          1600          1010.65085
## 95         1270.5097          1700           429.49029
## 96        17989.2017         10200          7789.20170
## 97         2394.5300          3000           605.47005
## 98          743.3385          1000           256.66145
## 99        18266.4173         25000          6733.58267
## 100       17817.7702         19400          1582.22981
## 101        1187.6213           800           387.62133
## 102         945.4627           600           345.46266
## 103        4341.5272          9000          4658.47277
## 104        1031.6614          3900          2868.33861
## 105         954.0438           900            54.04382
## 106        2960.5432          1600          1360.54325
## 107        8094.4480         14400          6305.55195
## 108        1985.8519          1800           185.85193
## 109        7692.8073         11400          3707.19266
## 110        7126.3163          9000          1873.68368
## 111        1934.1250          2000            65.87496
## 112        5103.3776          1800          3303.37762
## 113        1577.4165          1200           377.41655
## 114        5822.2335          7200          1377.76652
## 115       17481.0966         28600         11118.90345
## 116       15621.5322         11400          4221.53222
## 117        6889.5073         15000          8110.49274
## 118       19205.1169         21600          2394.88314
## 119        2850.6382          2300           550.63825
## 120         953.7661           800           153.76613
## 121        1149.7352           900           249.73525
## 122       27964.5069         37800          9835.49309
## 123        4476.9657         13100          8623.03428
## 124        1699.1262          2100           400.87377
## 125        9129.5242          9000           129.52419
## 126        5093.3996          6300          1206.60037
## 127        1948.1865          2200           251.81345
## 128        2548.6121          4700          2151.38786
## 129        4793.6170          5100           306.38303
## 130        2025.1319          4600          2574.86807
## 131       13898.8222         11100          2798.82219
## 132       13514.9431         18000          4485.05694
## 133       12592.7563         16800          4207.24374
## 134       23465.5849         21500          1965.58488
## 135        6398.8272          5800           598.82724
## 136        8236.6159          7000          1236.61595
## 137         893.5355          1300           406.46449
## 138       19736.0692          8800         10936.06917
## 139        5298.3792          5400           101.62082
## 140        4367.5399          2900          1467.53990
## 141        4547.2311          5600          1052.76890
## 142        1110.5380          1200            89.46197
## 143        6134.4160         11800          5665.58403
## 144        1841.5065          1600           241.50647
## 145        7635.5251          7800           164.47494
## 146         691.5215           500           191.52154
## 147         898.8130           500           398.81301
## 148        8216.7457         10700          2483.25430
## 149       19397.1190         22000          2602.88104
## 150        2589.1407          3700          1110.85932
## 151        1588.9470          1900           311.05300
## 152        8412.9916          4000          4412.99164
## 153        2489.8557          4900          2410.14430
## 154       63418.4064         26800         36618.40641
## 155       34921.7298         32700          2221.72981
## 156        2585.4414          3300           714.55858
## 157       20866.7869         23400          2533.21312
## 158        1670.6324          1000           670.63237
## 159        5192.8646          7400          2207.13541
## 160        1383.3201          1500           116.67990
## 161        2300.2709          2200           100.27085
## 162       17484.6571          9500          7984.65708
## 163        4797.4177          6900          2102.58230
## 164        6990.1774          6700           290.17739
## 165        3089.9702          5800          2710.02976
## 166         842.3678          1400           557.63217
## 167        9629.7151          5400          4229.71514
## 168       14712.2799         23200          8487.72012
## 169       38461.6690         27700         10761.66905
## 170       89126.1759         37800         51326.17588
## 171        9853.2618         12800          2946.73816
## 172        1929.0261          1700           229.02611
## 173        2726.7849          2900           173.21507
## 174        7157.0735          4800          2357.07345
## 175        3413.5589          2500           913.55887
## 176        1474.0189           800           674.01887
## 177        1563.4974           800           763.49740
## 178        2386.4932          1900           486.49322
mean(pred_test$Absolute_Difference)
## [1] 3094.235